# cluster with euclidean dissim, ward grouping
dend <- dat_frm %>%
vegdist(method = 'euclidean') %>%
hclust(method = 'average')
# grouping vector
ngrps <- 5
grps <- cutree(dend, k = ngrps)
grps[grps == 3] <- 5
grps[grps == 4] <- 3
grps[grps == 5] <- 4
# color vector for groups
cols <- mapviewGetOption("vector.palette")(ngrps)
cols[4] <- cols[5]
cols[c(4,5)] <- 'orange'
colsd <- cols[c(1, 3, 2, 4, 5)]
p1 <- dend %>%
as.dendrogram %>%
set("labels_cex", 0.4) %>%
color_branches(k = ngrps, col = colsd) %>%
color_labels(k = ngrps, col = colsd)
jpeg(here('figs', 'siteclsts.jpg'), height = 4.5, width = 10, units = 'in', res = 500, family = 'serif')
plot(p1, ylab = 'Dissimilarity', xlab = 'Monitoring stations (155)')
dev.off()
knitr::include_graphics(here('figs', 'siteclsts.jpg'))
Figure 1.1: Dendrogram showing sites clustering based on a dissimilarity matrix of Euclidean distances for the measured variables. The tree was cut to identify site groupings within which relative values of the measured variables were similar. Sites closer to each other at the terminal branches are more similar. Sites distribution: group 1, 12 sites; group 3, 20; group 2, 107; group 4, 14.
cols <- cols[1:4]
toplo <- dat_frm %>%
mutate(Groups = as.character(grps))
p <- ggpairs(toplo, mapping = ggplot2::aes(fill = Groups, colour = Groups))
for (row in seq_len(p$nrow))
for (col in seq_len(p$ncol))
p[row, col] <- p[row, col] + scale_colour_manual(values = cols) + scale_fill_manual(values = cols)
# circlize_dendrogram(p1)
jpeg(here('figs', 'prplts.jpg'), height = 7, width = 7, units = 'in', res = 500, family = 'serif')
p
dev.off()
knitr::include_graphics(here('figs', 'prplts.jpg'))
Figure 1.2: A generalized pairs plot displaying paired combinations of categorical and quantitative/continuous variables (Emerson et al. 2013).
cols <- cols[1:4]
ppp <- PCA(dat_frm, scale.unit = F, graph = F)
p1 <- ggord(ppp, grp_in = as.character(grps), vec_ext = 3, alpha = 0.8, size = 2, txt = 4, arrow = 0.2, cols = cols) +
mythm
jpeg(here('figs', 'pcanuts.jpg'), height = 5, width = 5, units = 'in', res = 500, family = 'serif')
p1
dev.off()
knitr::include_graphics(here('figs', 'pcanuts.jpg'))
Figure 1.3: Evaluating site groupings in Fig. 1 and the variance among measured variables using principal components analysis. Sites with similar characters are closer to each other in the ordination plot. Included are the measured variables related to site groupings as two-dimensional vectors.
# metals clustering
set.seed(1234)
ngrps <- 4
grps <- kmeans(dat_frm_mets, centers = ngrps, nstart = 10, iter.max = 30) %>%
.$cluster
# metals cluster colors
cols <- mapviewGetOption("vector.palette")(ngrps)
cols[4] <- 'orange'
colgrp <- data.frame(
grps = c(1:ngrps),
cols = cols,
stringsAsFactors = F
)
# data for summary plot
toplo <- dat_frm_mets %>%
mutate(grps = grps) %>%
left_join(colgrp, by = 'grps') %>%
mutate(grps = factor(grps)) %>%
gather('var', 'val', -grps, -cols) %>%
group_by(grps, cols, var) %>%
summarise(
med = quantile(val, probs = 0.5, na.rm = T),
min = quantile(val, probs = 0.05, na.rm = T),
max = quantile(val, probs = 0.95, na.rm = T)
) %>%
ungroup() %>%
mutate(var = fct_relevel(var, 'Amph. surv.', 'Embr. surv.'))
p <- ggplot(toplo, aes(x = grps, y = med)) +
geom_errorbar(aes(ymin = min, ymax = max), width = 0) +
geom_point(pch = 21, fill = toplo$cols, size = 3) +
facet_wrap(~var, ncol = 5) +
mythm +
scale_y_continuous('Standardized measurements') +
scale_x_discrete('Groups')
jpeg(here('figs', 'metsums.jpg'), height = 7, width = 4.5, units = 'in', res = 500, family = 'serif')
p
dev.off()
knitr::include_graphics(here('figs', 'metsums.jpg'))
Figure 1.4: Summaries (median and 5th/95th percentiles) of toxicity measurements and metal concentrations among the groups defined with cluster analysis. The y-axis is standardized for relative comparisons among groups and constituents.
ppp <- PCA(dat_frm_mets, scale.unit = F, graph = F)
mythm2 <- theme_bw(base_family = 'serif') +
theme(
legend.position = 'none',
strip.background = element_blank()
)
p1 <- ggord(ppp, grp_in = as.character(grps), vec_ext = 9, alpha = 0.8, size = 2, txt = 4, arrow = 0.2, cols = cols, coord_fix = F) +
mythm2
p2 <- ggord(ppp, grp_in = as.character(grps), axes = c('2', '3'), vec_ext = 9, alpha = 0.8, size = 2, txt = 4, arrow = 0.2, cols = cols, coord_fix = F) +
mythm2
p3 <- ggord(ppp, grp_in = as.character(grps), axes = c('1', '3'), vec_ext = 9, alpha = 0.8, size = 2, txt = 4, arrow = 0.2, cols = cols, coord_fix = F)
pleg <- g_legend(p3)
p3 <- p3 + mythm2
jpeg(here('figs', 'pcamets.jpg'), height = 11, width = 5.5, units = 'in', res = 500, family = 'serif')
{p1 + p2 + p3 + plot_layout(ncol = 1)} -
pleg + plot_layout(ncol = 2, widths = c(1, 0.2))
dev.off()
knitr::include_graphics(here('figs', 'pcamets.jpg'))
Figure 1.5: Site groupings from the k-means clusters and variance among the measured variables using principal components analysis. The first three PCA axes are shown, with each plot showing a different combination of axes.
tomod <- dat_frm_mets %>%
mutate(grps = factor(grps)) %>%
rename(
amph_surv = `Amph. surv.`,
embr_surv = `Embr. surv.`
)
set.seed(123)
tmp <- randomForest(grps ~ .,
data = tomod,
importance = TRUE,
ntree = 1000, na.action = na.omit)
# plos for variable importance groupings
plos <- tmp$importance %>%
as.data.frame %>%
rownames_to_column('var') %>%
dplyr::select(-MeanDecreaseAccuracy, -MeanDecreaseGini) %>%
gather('grp', 'importance', -var) %>%
group_by(grp) %>%
nest %>%
mutate(
implo = pmap(list(as.character(grp), data), function(grp, data){
colfl <- colgrp %>%
filter(grps %in% grp) %>%
pull(cols)
toplo <- data %>%
arrange(-importance) %>%
.[1:10, ] %>%
mutate(var = factor(var, levels = var))
p <- ggplot(toplo, aes(x = var, y = importance)) +
geom_segment(aes(xend = var, yend = 0)) +
geom_point(fill = colfl, pch = 21, size = 3) +
theme_bw() +
theme(
axis.title.x = element_blank(),
# axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title.y = element_blank()
) +
ggtitle(grp) +
scale_y_continuous(limits = c(0, 0.17), labels = scales::percent) +
coord_flip()
return(p)
})
)
# variable importance for toxicity
plos2 <- dat_frm_mets %>%
gather('var', 'val', `Amph. surv.`, `Embr. surv.`) %>%
group_by(var) %>%
nest %>%
mutate(
implo = pmap(list(var, data), function(var, data){
tmp <- randomForest(val ~ .,
data = data,
importance = TRUE,
ntree = 1000, na.action = na.omit)
toplo <- tmp$importance %>%
as.data.frame %>%
rownames_to_column('var') %>%
dplyr::select(-IncNodePurity) %>%
rename(importance = `%IncMSE`) %>%
arrange(-importance) %>%
.[1:10, ] %>%
mutate(var = factor(var, levels = var))
p <- ggplot(toplo, aes(x = var, y = importance)) +
geom_segment(aes(xend = var, yend = 0)) +
geom_point(fill = 'grey', pch = 21, size = 3) +
theme_bw() +
theme(
axis.title.x = element_blank(),
# axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title.y = element_blank()
) +
ggtitle(var) +
scale_y_continuous(limits = c(0, 0.41), labels = scales::percent) +
coord_flip()
return(p)
})
)
p1 <- plos$implo[[1]]
p2 <- plos$implo[[2]]
p3 <- plos$implo[[3]]
p4 <- plos$implo[[4]]
p5 <- plos2$implo[[1]]
p6 <- plos2$implo[[2]]
jpeg(here('figs', 'grpimp.jpg'), height = 7.5, width = 7.5, units = 'in', res = 500, family = 'serif')
grid.arrange(
arrangeGrob(p1, p2, p3, p4, ncol = 4, top = '(a) Group variable importance'),
arrangeGrob(p5, p6, ncol = 2, top = '(b) Toxicity variable importance'),
left = grid::textGrob('Importance (% increase MSE)', rot = 90),
ncol = 1, heights = c(1, 1)
)
dev.off()
knitr::include_graphics(here('figs', 'grpimp.jpg'))
Figure 1.6: Variable importance estimates from random forest models describing (a) group clusters pr (b) differences among sites for toxicity test results. Importance estimates are based on the increase in mean square error (MSE) for excluding a specific variable from the random forest model.
knitr::include_graphics(here('figs', 'sitemap.jpg'))
Figure 1.7: Map showing site locations color-coded by clusters. A: Ventura-Oxnard, B: Marina Del Rey, C: Port of Los Angeles (San Pedro), D: Costa Mesa, E: Dana Point, F: Oceanside, G: Carlsbad-Encinitas-Solana Beach, H: San Diego Bay.